##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /var/folders/qw/2tnkb3b11dncn1d6lmqs7rh40000gn/T//RtmpuWuMJp/h2o_krishnaprasad_started_from_r.out
## /var/folders/qw/2tnkb3b11dncn1d6lmqs7rh40000gn/T//RtmpuWuMJp/h2o_krishnaprasad_started_from_r.err
##
##
## Starting H2O JVM and connecting: .. Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 seconds 854 milliseconds
## H2O cluster timezone: America/Denver
## H2O data parsing timezone: UTC
## H2O cluster version: 3.28.0.2
## H2O cluster version age: 1 month and 6 days
## H2O cluster name: H2O_started_from_R_krishnaprasad_tnc829
## H2O cluster total nodes: 1
## H2O cluster total memory: 4.00 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 3.6.1 (2019-07-05)
# Read Classification dataset from RDS
class.data <- readRDS("KDEN_Class_Data_New.RDS")
# Reorder data frame by Date so the lag below is a true previous-day value
class.data <- class.data[order(class.data$DATE), ]
# Previous-day precipitation feature.
# NOTE: the original used lag(class.data$PRCP, k = 1). base stats::lag() on a
# plain vector only shifts the time-series attribute (values unchanged), and
# dplyr::lag() takes `n`, not `k` -- so shift the vector explicitly instead.
class.data$PRCP_LAG_1 <- c(NA, head(class.data$PRCP, -1))
# class.data$TEMP_LAG_1 <- lag(class.data$TEMP, k = 1)
# Drop rows with any NA (including the first row introduced by the lag)
class.data <- class.data[complete.cases(class.data), ]
# Remove rows carrying GSOD sentinel "missing" codes (999.9 wind, 99.99 precip)
class.data <-
  class.data[!(class.data$MXSPD == 999.9 |
                 class.data$PRCP == 99.99 | class.data$PRCP_LAG_1 == 99.99), ]
class.data$FOG <- as.factor(class.data$FOG)
class.data$SNOW_ICE <- as.factor(class.data$SNOW_ICE)
# Recode binary strike indicator to valid R level names for caret classProbs
class.data$STRIKE <-
  as.factor(ifelse(class.data$STRIKE == 0, "NO", "YES"))
# one-hot-encoding categorical features
ohe_feats <- c("MONTH")
# Create dummy (indicator) columns for each MONTH level
dummies <- dummyVars(~ MONTH, data = class.data)
df.dummies <- as.data.frame(predict(dummies, newdata = class.data))
# Merge dummies into the data frame, dropping the original MONTH column.
# Use a logical mask rather than -which(): if ohe_feats matched no column,
# a -integer(0) index would silently drop EVERY column.
class.data <-
  cbind(class.data[, !(colnames(class.data) %in% ohe_feats)], df.dummies)
# Hold out 2019 as the validation set
valid.cl.data <-
  class.data[(class.data$YEAR == 2019), ]
# Keep the remaining modern years (2008-2018) for training/testing
class.data <-
  class.data[!(class.data$YEAR %in% c(1995:2007, 2019)), ]
# Drop identifier / leakage-prone columns; MONTH.12 is removed so the month
# dummies are not perfectly collinear (dummy-variable trap)
drop_cols <- c("DATE", "YEAR", "SEASON", "MXSPD", "SNOW_ICE", "MONTH.12", "STRIKECOUNT")
class.data <- class.data[, !(colnames(class.data) %in% drop_cols)]
valid.cl.data <- valid.cl.data[, !(colnames(valid.cl.data) %in% drop_cols)]
# Create the training and test datasets
set.seed(100)
# Ensure the response is a factor (no-op if it was already converted upstream)
class.data$STRIKE <- as.factor(class.data$STRIKE)
# Step 1: draw a 70% sample of row indices, stratified on the response
trainRowNumbers.cl <- createDataPartition(
  class.data$STRIKE,
  p = 0.70,
  list = FALSE
)
# Step 2: rows inside the partition form the training set
train.data <- class.data[trainRowNumbers.cl, ]
# Step 3: all remaining rows form the test set
test.data <- class.data[-trainRowNumbers.cl, ]
# Print a fitted caret model, its MLeval evaluation plots, and the confusion
# matrix of its class predictions on `data`.
#
# model: a caret `train` object fit with classProbs = TRUE and saved predictions
# data:  data frame of predictors plus the STRIKE outcome column
# Returns the caret::confusionMatrix object (value of the last expression).
validateAndPrintResult <- function(model, data) {
  # Summarise Results
  print(model)
  ## run MLeval on the model's saved CV predictions
  res <- evalm(model)
  ## Explicitly print each plot: inside a function, bare expressions such as
  ## res$roc are NOT auto-printed, so the original lines were no-ops.
  print(res$roc)  # ROC curve
  print(res$cc)   # calibration curve
  print(res$prg)  # precision recall gain curve
  # Predict on the supplied data and show a preview
  predicted.resp <- predict(model, data)
  print(head(predicted.resp))
  caret::confusionMatrix(
    reference = as.factor(data$STRIKE),
    data = predicted.resp,
    mode = 'everything',
    positive = 'YES'
  )
}
# # 5 Fold cross validation with Probabilities
# tc <- trainControl(
# method = "cv",
# number = 5,
# savePredictions = "final",
# classProbs = TRUE,
# verboseIter = TRUE,
# summaryFunction = twoClassSummary
# )
# model.metrics.class <-
# data.frame(
# "Model_Name" = character(0),
# "Data_Type" = character(0),
# "AUC" = numeric(0),
# "Accuracy" = numeric(0),
# "Sensitivity" = numeric(0),
# "Specificity" = numeric(0),
# "Precision" = numeric(0),
# "Recall" = numeric(0),
# "Kappa" = numeric(0),
# "Confusion_Matrix" = character(0),
# stringsAsFactors = FALSE
# )
# saveModelMetrics <-
# function(model.predictions,
# data,
# model.name,
# data.type) {
#
# # auc
#
# # accuracy
#
# # sensitivity
#
# # specificity
#
# # precision
#
# # recall
#
# # kappa
#
# # confusionMatrix
#
#
# # combine all the above validation metrics
# model.summary <-
# c(model.name,
# data.type,
# auc,
# accuracy,
# sensitivity,
# specificity,
# precision,
# recall,
# kappa,
# confusionMatrix)
#
# return(model.summary)
# }
# X_train = sparse.model.matrix(as.formula(paste(
# "STRIKE ~", paste(colnames(train.data[, -11]), sep = "", collapse = " +")
# )), data = train.data)
#
# y_train <- as.factor(train.data[,11])
#
# X_test = sparse.model.matrix(as.formula(paste(
# "STRIKE ~", paste(colnames(test.data[, -11]), sep = "", collapse = " +")
# )), data = test.data)
#
# y_test <- as.factor(test.data[,11])
#
# X_val = sparse.model.matrix(as.formula(paste(
# "STRIKE ~", paste(colnames(valid.cl.data[, -11]), sep = "", collapse = " +")
# )), data = valid.cl.data)
#
# y_val <- as.factor(valid.cl.data[,11])
#
# # ELASTIC NET WITH 0 < ALPHA < 1
# a <- seq(0.1, 0.9, 0.05)
# search <- foreach(i = a, .combine = rbind) %dopar% {
# cv <-
# cv.glmnet(
# X_train,
# y_train,
# family = "binomial",
# nfold = 10,
# type.measure = "deviance",
# parallel = TRUE,
# alpha = i
# )
# data.frame(
# cvm = cv$cvm[cv$lambda == cv$lambda.1se],
# lambda.1se = cv$lambda.1se,
# alpha = i
# )
# }
# plot(search$lambda.1se)
# cv3 <- search[search$cvm == min(search$cvm), ]
#
#
# model.glmnet <-
# glmnet(
# X_train,
# y_train,
# family = "binomial",
# lambda = cv3$lambda.1se,
# alpha = cv3$alpha
# )
# coef(model.glmnet)
#
# model.glmnet
#
# summary(model.glmnet)
#
# preds <- predict(model.glmnet, X_test, type = "response")
#
# # Calculate true positive rate and false positive rate on the prediction object
# perf <- performance(prediction(preds, y_test), 'tpr', 'fpr')
# roc.auc.glmnet <- performance(prediction(preds, y_test), "auc")
# plot(perf, main = paste("ROC - Elastic Net"," | ","AUC - ", roc.auc.glmnet@y.values), colorize = TRUE) # plot ROC curve
# lines(c(0, 1), c(0, 1), col = "gray", lty = 4)
#
#
# predicted <- predict(model.glmnet, X_val, type = "response")
#
# # Calculate true positive rate and false positive rate on the prediction object
# perf <- performance(prediction(predicted, y_val), 'tpr', 'fpr')
# roc.auc.glmnet <- performance(prediction(predicted, y_val), "auc")
# plot(perf, main = paste("ROC - Elastic Net"," | ","AUC - ", roc.auc.glmnet@y.values), colorize = TRUE) # plot ROC curve
# lines(c(0, 1), c(0, 1), col = "gray", lty = 4)
#
#
#
# optCutOff <- optimalCutoff(y_val, predicted, optimiseFor = "Both", returnDiagnostics = T)
# optCutOff$
#
#
# #Misclassify Errors - needs to be low
# misClassError(y_val, predicted, threshold = optCutOff$optimalCutoff)
#
# # Concordance - needs to be high
# Concordance(y_val, predicted)
#
# sensitivity(y_val, predicted, threshold = optCutOff)
#
# specificity(y_val, predicted, threshold = optCutOff)
#
# # Confusion Matrix
# test <- confusionMatrix(y_val, predicted, threshold = optCutOff)
#
# ks_plot(y_val, predicted[1])
#
#
# predicted.class <- predict(model.glmnet, X_val, type = "class")
#
# # Output dataframe with probabilities
# output.data <- cbind(valid.cl.data, predicted)
# output.data <- cbind(output.data, predicted.class)
# Shared 7-fold CV control. Resample indices, final predictions and class
# probabilities are all required for the caretEnsemble / caretStack steps.
trControl <- trainControl(
  method = "cv",
  number = 7,
  savePredictions = "final",
  index = createResample(as.factor(train.data$STRIKE), 7),
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  allowParallel = TRUE
)
# col_sample_rate
# <chr>
# learn_rate
# <chr>
# max_depth
# <chr>
# ntrees
# <chr>
# sample_rate
# <chr>
# model_ids
# <chr>
# auc
# <chr>
# 0.2509 0.0072 7 203 0.7626 grid_binomial_xgb_55_model_6 0.7270077693205641
# xgboost grid: one candidate per tree depth (2..8); other settings fixed
xgbTreeGrid <- expand.grid(
  nrounds = 500,
  max_depth = seq(2, 8, by = 1),
  eta = 0.1,
  gamma = 0,
  colsample_bytree = 1.0,
  subsample = 1.0,
  min_child_weight = 4
)
## Single elastic-net tuning point; caret accepts the legacy leading dot on
## tuneGrid parameter names
glmnetGridElastic <- expand.grid(
  .alpha = 0.3,
  .lambda = 0.009
)
# col_sample_rate
# <chr>
# learn_rate
# <chr>
# max_depth
# <chr>
# ntrees
# <chr>
# sample_rate
# <chr>
# model_ids
# <chr>
# auc
# <chr>
# 0.4087 0.1532 1 385 0.9297
# GBM grid: 400 trees crossed with interaction depth, shrinkage, min node size
gbm.tune.grid <- expand.grid(
  .n.trees = c(400),
  .interaction.depth = c(1, 3, 5),
  .shrinkage = c(.01, .1, .3),
  .n.minobsinnode = c(5, 10, 15)
)
set.seed(333)
# Fit all four base learners on identical resamples (via trControl$index)
# so their out-of-fold predictions can be ensembled/stacked later.
modelList <- caretList(
  STRIKE ~ .,
  train.data,
  trControl = trControl,
  metric = "ROC",
  verbose = TRUE,
  tuneList = list(
    ## Do not use custom names in list. Will give prediction error with greedy ensemble. Bug in caret.
    xgbTree = caretModelSpec(
      method = "xgbTree",
      tuneGrid = xgbTreeGrid,
      nthread = 8
    ),
    ## Elastic, highly correlated with lasso and ridge regressions
    glmnet = caretModelSpec(
      method = "glmnet",
      tuneGrid = glmnetGridElastic
    ),
    ## rf
    rf = caretModelSpec(
      method = "rf",
      ntree = 2000,
      tuneLength = 20,
      tuneGrid = data.frame(mtry = 10)
    ),
    gbm = caretModelSpec(
      method = "gbm",
      tuneGrid = gbm.tune.grid
    )
  )
)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3807 nan 0.0100 0.0016
## 2 1.3775 nan 0.0100 0.0016
## 3 1.3743 nan 0.0100 0.0015
## 4 1.3712 nan 0.0100 0.0015
## 5 1.3680 nan 0.0100 0.0014
## 6 1.3650 nan 0.0100 0.0014
## 7 1.3623 nan 0.0100 0.0013
## 8 1.3592 nan 0.0100 0.0014
## 9 1.3562 nan 0.0100 0.0014
## 10 1.3536 nan 0.0100 0.0012
## 20 1.3280 nan 0.0100 0.0011
## 40 1.2882 nan 0.0100 0.0008
## 60 1.2591 nan 0.0100 0.0005
## 80 1.2375 nan 0.0100 0.0005
## 100 1.2207 nan 0.0100 0.0003
## 120 1.2081 nan 0.0100 0.0002
## 140 1.1977 nan 0.0100 0.0001
## 160 1.1894 nan 0.0100 0.0000
## 180 1.1823 nan 0.0100 0.0001
## 200 1.1769 nan 0.0100 0.0001
## 220 1.1717 nan 0.0100 0.0001
## 240 1.1670 nan 0.0100 0.0001
## 260 1.1636 nan 0.0100 -0.0000
## 280 1.1601 nan 0.0100 -0.0000
## 300 1.1567 nan 0.0100 -0.0000
## 320 1.1539 nan 0.0100 -0.0000
## 340 1.1511 nan 0.0100 -0.0000
## 360 1.1485 nan 0.0100 -0.0000
## 380 1.1461 nan 0.0100 -0.0001
## 400 1.1436 nan 0.0100 -0.0000
# gbm.pred <- predict(modelList$gbm, newdata = test.data, type = 'raw')
#
# ## run MLeval
# res <- evalm(modelList$gbm)
#
#
# ## get ROC
#
# res$roc
#
# ## get calibration curve
#
# res$cc
#
# ## get precision recall gain curve
#
# res$prg
#
# caret::confusionMatrix(
# reference = as.factor(test.data$STRIKE),
# data = gbm.pred,
# mode = 'everything',
# positive = 'YES'
# )
# Evaluate the xgboost base learner on the held-out test split
validateAndPrintResult(modelList$xgbTree, test.data)
## eXtreme Gradient Boosting
##
## 2813 samples
## 20 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ...
## Resampling results across tuning parameters:
##
## max_depth ROC Sens Spec
## 2 0.7258595 0.6501899 0.6938797
## 3 0.7088709 0.6424089 0.6845110
## 4 0.7005036 0.6267529 0.6831272
## 5 0.6967093 0.6232290 0.6722892
## 6 0.6951912 0.6188989 0.6751165
## 7 0.6927515 0.6205358 0.6723515
## 8 0.6909788 0.6225654 0.6689167
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## Tuning
## parameter 'min_child_weight' was held constant at a value of 4
##
## Tuning parameter 'subsample' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 500, max_depth = 2, eta
## = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 4 and subsample = 1.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.353792721084333
## Group 1 AUC-ROC = 0.73

## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 387 230
## YES 186 401
##
## Accuracy : 0.6545
## 95% CI : (0.6269, 0.6814)
## No Information Rate : 0.5241
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.3098
##
## Mcnemar's Test P-Value : 0.03501
##
## Sensitivity : 0.6355
## Specificity : 0.6754
## Pos Pred Value : 0.6831
## Neg Pred Value : 0.6272
## Precision : 0.6831
## Recall : 0.6355
## F1 : 0.6585
## Prevalence : 0.5241
## Detection Rate : 0.3331
## Detection Prevalence : 0.4875
## Balanced Accuracy : 0.6554
##
## 'Positive' Class : YES
##
# Evaluate the xgboost base learner on the 2019 validation set
validateAndPrintResult(modelList$xgbTree, valid.cl.data)
## eXtreme Gradient Boosting
##
## 2813 samples
## 20 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ...
## Resampling results across tuning parameters:
##
## max_depth ROC Sens Spec
## 2 0.7258595 0.6501899 0.6938797
## 3 0.7088709 0.6424089 0.6845110
## 4 0.7005036 0.6267529 0.6831272
## 5 0.6967093 0.6232290 0.6722892
## 6 0.6951912 0.6188989 0.6751165
## 7 0.6927515 0.6205358 0.6723515
## 8 0.6909788 0.6225654 0.6689167
##
## Tuning parameter 'nrounds' was held constant at a value of 500
## Tuning
## parameter 'min_child_weight' was held constant at a value of 4
##
## Tuning parameter 'subsample' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 500, max_depth = 2, eta
## = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 4 and subsample = 1.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.353792721084333
## Group 1 AUC-ROC = 0.73

## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 73 32
## YES 83 115
##
## Accuracy : 0.6205
## 95% CI : (0.5632, 0.6753)
## No Information Rate : 0.5149
## P-Value [Acc > NIR] : 0.0001362
##
## Kappa : 0.2478
##
## Mcnemar's Test P-Value : 3.124e-06
##
## Sensitivity : 0.7823
## Specificity : 0.4679
## Pos Pred Value : 0.5808
## Neg Pred Value : 0.6952
## Precision : 0.5808
## Recall : 0.7823
## F1 : 0.6667
## Prevalence : 0.4851
## Detection Rate : 0.3795
## Detection Prevalence : 0.6535
## Balanced Accuracy : 0.6251
##
## 'Positive' Class : YES
##
# Evaluate the elastic-net (glmnet) base learner on the held-out test split
validateAndPrintResult(modelList$glmnet, test.data)
## glmnet
##
## 2813 samples
## 20 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7445988 0.7169431 0.6709229
##
## Tuning parameter 'alpha' was held constant at a value of 0.3
## Tuning
## parameter 'lambda' was held constant at a value of 0.009
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.39212299981179
## Group 1 AUC-ROC = 0.74

## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 405 231
## YES 168 400
##
## Accuracy : 0.6686
## 95% CI : (0.6412, 0.6952)
## No Information Rate : 0.5241
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.339
##
## Mcnemar's Test P-Value : 0.00191
##
## Sensitivity : 0.6339
## Specificity : 0.7068
## Pos Pred Value : 0.7042
## Neg Pred Value : 0.6368
## Precision : 0.7042
## Recall : 0.6339
## F1 : 0.6672
## Prevalence : 0.5241
## Detection Rate : 0.3322
## Detection Prevalence : 0.4718
## Balanced Accuracy : 0.6704
##
## 'Positive' Class : YES
##
# Evaluate the elastic-net (glmnet) base learner on the 2019 validation set
validateAndPrintResult(modelList$glmnet, valid.cl.data)
## glmnet
##
## 2813 samples
## 20 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7445988 0.7169431 0.6709229
##
## Tuning parameter 'alpha' was held constant at a value of 0.3
## Tuning
## parameter 'lambda' was held constant at a value of 0.009
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.39212299981179
## Group 1 AUC-ROC = 0.74

## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 88 33
## YES 68 114
##
## Accuracy : 0.6667
## 95% CI : (0.6105, 0.7195)
## No Information Rate : 0.5149
## P-Value [Acc > NIR] : 6.278e-08
##
## Kappa : 0.3373
##
## Mcnemar's Test P-Value : 0.0007167
##
## Sensitivity : 0.7755
## Specificity : 0.5641
## Pos Pred Value : 0.6264
## Neg Pred Value : 0.7273
## Precision : 0.6264
## Recall : 0.7755
## F1 : 0.6930
## Prevalence : 0.4851
## Detection Rate : 0.3762
## Detection Prevalence : 0.6007
## Balanced Accuracy : 0.6698
##
## 'Positive' Class : YES
##
# Evaluate the random forest base learner on the held-out test split
validateAndPrintResult(modelList$rf, test.data)
## Random Forest
##
## 2813 samples
## 20 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7181348 0.6595506 0.6801559
##
## Tuning parameter 'mtry' was held constant at a value of 10
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.339792012708135
## Group 1 AUC-ROC = 0.72

## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 391 242
## YES 182 389
##
## Accuracy : 0.6478
## 95% CI : (0.6201, 0.6748)
## No Information Rate : 0.5241
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2974
##
## Mcnemar's Test P-Value : 0.004166
##
## Sensitivity : 0.6165
## Specificity : 0.6824
## Pos Pred Value : 0.6813
## Neg Pred Value : 0.6177
## Precision : 0.6813
## Recall : 0.6165
## F1 : 0.6473
## Prevalence : 0.5241
## Detection Rate : 0.3231
## Detection Prevalence : 0.4743
## Balanced Accuracy : 0.6494
##
## 'Positive' Class : YES
##
# Evaluate the random forest base learner on the 2019 validation set
validateAndPrintResult(modelList$rf, valid.cl.data)
## Random Forest
##
## 2813 samples
## 20 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7181348 0.6595506 0.6801559
##
## Tuning parameter 'mtry' was held constant at a value of 10
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.339792012708135
## Group 1 AUC-ROC = 0.72

## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 84 31
## YES 72 116
##
## Accuracy : 0.6601
## 95% CI : (0.6037, 0.7133)
## No Information Rate : 0.5149
## P-Value [Acc > NIR] : 2.225e-07
##
## Kappa : 0.325
##
## Mcnemar's Test P-Value : 8.104e-05
##
## Sensitivity : 0.7891
## Specificity : 0.5385
## Pos Pred Value : 0.6170
## Neg Pred Value : 0.7304
## Precision : 0.6170
## Recall : 0.7891
## F1 : 0.6925
## Prevalence : 0.4851
## Detection Rate : 0.3828
## Detection Prevalence : 0.6205
## Balanced Accuracy : 0.6638
##
## 'Positive' Class : YES
##
# Evaluate the GBM base learner on the held-out test split
validateAndPrintResult(modelList$gbm, test.data)
## Stochastic Gradient Boosting
##
## 2813 samples
## 20 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.minobsinnode ROC Sens Spec
## 0.01 1 5 0.7405019 0.6775723 0.6978071
## 0.01 1 10 0.7398820 0.6778053 0.6981943
## 0.01 1 15 0.7400998 0.6780858 0.6987150
## 0.01 3 5 0.7500151 0.6730403 0.7134005
## 0.01 3 10 0.7498429 0.6749600 0.7107830
## 0.01 3 15 0.7494104 0.6757906 0.7118014
## 0.01 5 5 0.7482403 0.6713271 0.7096891
## 0.01 5 10 0.7486500 0.6716447 0.7132164
## 0.01 5 15 0.7484035 0.6727652 0.7145545
## 0.10 1 5 0.7371101 0.6614214 0.7009731
## 0.10 1 10 0.7358510 0.6619505 0.6999148
## 0.10 1 15 0.7362413 0.6660909 0.7014915
## 0.10 3 5 0.7183773 0.6384029 0.6898557
## 0.10 3 10 0.7191055 0.6418065 0.6860941
## 0.10 3 15 0.7187297 0.6404175 0.6903453
## 0.10 5 5 0.7113287 0.6346190 0.6784474
## 0.10 5 10 0.7054848 0.6252153 0.6850435
## 0.10 5 15 0.7071891 0.6257201 0.6819229
## 0.30 1 5 0.7157828 0.6374494 0.6890194
## 0.30 1 10 0.7167345 0.6424918 0.6912493
## 0.30 1 15 0.7201906 0.6451942 0.6954336
## 0.30 3 5 0.6845439 0.6067691 0.6763076
## 0.30 3 10 0.6797716 0.6064654 0.6597695
## 0.30 3 15 0.6802359 0.5973145 0.6708190
## 0.30 5 5 0.6748662 0.5967975 0.6578960
## 0.30 5 10 0.6719867 0.6032923 0.6568789
## 0.30 5 15 0.6757455 0.6042790 0.6569964
##
## Tuning parameter 'n.trees' was held constant at a value of 400
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 400, interaction.depth =
## 3, shrinkage = 0.01 and n.minobsinnode = 5.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.400352010588488
## Group 1 AUC-ROC = 0.75

## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 387 217
## YES 186 414
##
## Accuracy : 0.6653
## 95% CI : (0.6378, 0.6919)
## No Information Rate : 0.5241
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3307
##
## Mcnemar's Test P-Value : 0.1351
##
## Sensitivity : 0.6561
## Specificity : 0.6754
## Pos Pred Value : 0.6900
## Neg Pred Value : 0.6407
## Precision : 0.6900
## Recall : 0.6561
## F1 : 0.6726
## Prevalence : 0.5241
## Detection Rate : 0.3439
## Detection Prevalence : 0.4983
## Balanced Accuracy : 0.6657
##
## 'Positive' Class : YES
##
# Evaluate the GBM base learner on the 2019 validation set
validateAndPrintResult(modelList$gbm, valid.cl.data)
## Stochastic Gradient Boosting
##
## 2813 samples
## 20 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.minobsinnode ROC Sens Spec
## 0.01 1 5 0.7405019 0.6775723 0.6978071
## 0.01 1 10 0.7398820 0.6778053 0.6981943
## 0.01 1 15 0.7400998 0.6780858 0.6987150
## 0.01 3 5 0.7500151 0.6730403 0.7134005
## 0.01 3 10 0.7498429 0.6749600 0.7107830
## 0.01 3 15 0.7494104 0.6757906 0.7118014
## 0.01 5 5 0.7482403 0.6713271 0.7096891
## 0.01 5 10 0.7486500 0.6716447 0.7132164
## 0.01 5 15 0.7484035 0.6727652 0.7145545
## 0.10 1 5 0.7371101 0.6614214 0.7009731
## 0.10 1 10 0.7358510 0.6619505 0.6999148
## 0.10 1 15 0.7362413 0.6660909 0.7014915
## 0.10 3 5 0.7183773 0.6384029 0.6898557
## 0.10 3 10 0.7191055 0.6418065 0.6860941
## 0.10 3 15 0.7187297 0.6404175 0.6903453
## 0.10 5 5 0.7113287 0.6346190 0.6784474
## 0.10 5 10 0.7054848 0.6252153 0.6850435
## 0.10 5 15 0.7071891 0.6257201 0.6819229
## 0.30 1 5 0.7157828 0.6374494 0.6890194
## 0.30 1 10 0.7167345 0.6424918 0.6912493
## 0.30 1 15 0.7201906 0.6451942 0.6954336
## 0.30 3 5 0.6845439 0.6067691 0.6763076
## 0.30 3 10 0.6797716 0.6064654 0.6597695
## 0.30 3 15 0.6802359 0.5973145 0.6708190
## 0.30 5 5 0.6748662 0.5967975 0.6578960
## 0.30 5 10 0.6719867 0.6032923 0.6568789
## 0.30 5 15 0.6757455 0.6042790 0.6569964
##
## Tuning parameter 'n.trees' was held constant at a value of 400
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 400, interaction.depth =
## 3, shrinkage = 0.01 and n.minobsinnode = 5.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***



## Group 1 Optimal Informedness = 0.400352010588488
## Group 1 AUC-ROC = 0.75

## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 86 29
## YES 70 118
##
## Accuracy : 0.6733
## 95% CI : (0.6173, 0.7258)
## No Information Rate : 0.5149
## P-Value [Acc > NIR] : 1.672e-08
##
## Kappa : 0.3512
##
## Mcnemar's Test P-Value : 5.816e-05
##
## Sensitivity : 0.8027
## Specificity : 0.5513
## Pos Pred Value : 0.6277
## Neg Pred Value : 0.7478
## Precision : 0.6277
## Recall : 0.8027
## F1 : 0.7045
## Prevalence : 0.4851
## Detection Rate : 0.3894
## Detection Prevalence : 0.6205
## Balanced Accuracy : 0.6770
##
## 'Positive' Class : YES
##
set.seed(333)
# Greedy GLM ensemble of three of the base learners (glmnet excluded).
# The inner trainControl must supply summaryFunction = twoClassSummary so the
# "ROC" metric actually exists: without it caret warned 'The metric "ROC" was
# not in the result set' and silently fell back to Accuracy. classProbs and
# savePredictions are required for ROC-based selection.
greedyEnsemble <- caretEnsemble(
  c(modelList$rf, modelList$xgbTree, modelList$gbm),
  metric = "ROC",
  trControl = trainControl(
    number = 7,
    method = "cv",
    savePredictions = "final",
    classProbs = TRUE,
    summaryFunction = twoClassSummary,
    verboseIter = TRUE
  )
)
## Warning in train.default(predobs$preds, predobs$obs, ...): The metric "ROC" was
## not in the result set. Accuracy will be used instead.
## Aggregating results
## Fitting final model on full training set
## A glm ensemble of 3 base models: rf1, xgbTree2, gbm3
##
## Ensemble results:
## Generalized Linear Model
##
## 7255 samples
## 3 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 6218, 6218, 6219, 6219, 6219, 6218, ...
## Resampling results:
##
## Accuracy Kappa
## 0.6941428 0.3870312
# Ensemble predictions and confusion matrix on the held-out test split
test.pred <- predict(greedyEnsemble, newdata = test.data)
caret::confusionMatrix(
  data = test.pred,
  reference = as.factor(test.data$STRIKE),
  positive = 'YES',
  mode = 'everything'
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 392 218
## YES 181 413
##
## Accuracy : 0.6686
## 95% CI : (0.6412, 0.6952)
## No Information Rate : 0.5241
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.3376
##
## Mcnemar's Test P-Value : 0.07151
##
## Sensitivity : 0.6545
## Specificity : 0.6841
## Pos Pred Value : 0.6953
## Neg Pred Value : 0.6426
## Precision : 0.6953
## Recall : 0.6545
## F1 : 0.6743
## Prevalence : 0.5241
## Detection Rate : 0.3430
## Detection Prevalence : 0.4934
## Balanced Accuracy : 0.6693
##
## 'Positive' Class : YES
##
# Ensemble predictions and confusion matrix on the 2019 validation set
valid.pred <- predict(greedyEnsemble, newdata = valid.cl.data)
caret::confusionMatrix(
  data = valid.pred,
  reference = as.factor(valid.cl.data$STRIKE),
  positive = 'YES',
  mode = 'everything'
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 87 29
## YES 69 118
##
## Accuracy : 0.6766
## 95% CI : (0.6207, 0.7289)
## No Information Rate : 0.5149
## P-Value [Acc > NIR] : 8.446e-09
##
## Kappa : 0.3576
##
## Mcnemar's Test P-Value : 8.162e-05
##
## Sensitivity : 0.8027
## Specificity : 0.5577
## Pos Pred Value : 0.6310
## Neg Pred Value : 0.7500
## Precision : 0.6310
## Recall : 0.8027
## F1 : 0.7066
## Prevalence : 0.4851
## Detection Rate : 0.3894
## Detection Prevalence : 0.6172
## Balanced Accuracy : 0.6802
##
## 'Positive' Class : YES
##
# Confusion Matrix and Statistics
#
# Reference
# Prediction NO YES
# NO 703 297
# YES 235 497
#
# Accuracy : 0.6928
# 95% CI : (0.6705, 0.7145)
# No Information Rate : 0.5416
# P-Value [Acc > NIR] : < 2.2e-16
#
# Kappa : 0.3777
#
# Mcnemar's Test P-Value : 0.008177
#
# Sensitivity : 0.6259
# Specificity : 0.7495
# Pos Pred Value : 0.6790
# Neg Pred Value : 0.7030
# Precision : 0.6790
# Recall : 0.6259
# F1 : 0.6514
# Prevalence : 0.4584
# Detection Rate : 0.2870
# Detection Prevalence : 0.4226
# Balanced Accuracy : 0.6877
#
# 'Positive' Class : YES
#
# Stack all four base learners with a GLM meta-model. Use <- (not =) for
# top-level assignment, and pass metric = "ROC" explicitly: trControl uses
# twoClassSummary, so the default Accuracy metric is absent and caret warned
# 'The metric "Accuracy" was not in the result set. ROC will be used instead.'
stack <- caretStack(modelList, method = "glm", metric = "ROC", trControl = trControl)
## Warning in train.default(predobs$preds, predobs$obs, ...): The metric "Accuracy"
## was not in the result set. ROC will be used instead.
## A glm ensemble of 4 base models: xgbTree, glmnet, rf, gbm
##
## Ensemble results:
## Generalized Linear Model
##
## 7255 samples
## 4 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7524129 0.6847667 0.7041084
# Stacked-model predictions and confusion matrix on the held-out test split
test.pred <- predict(stack, newdata = test.data)
caret::confusionMatrix(
  data = test.pred,
  reference = as.factor(test.data$STRIKE),
  positive = 'YES',
  mode = 'everything'
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 398 217
## YES 175 414
##
## Accuracy : 0.6744
## 95% CI : (0.6471, 0.7008)
## No Information Rate : 0.5241
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.3495
##
## Mcnemar's Test P-Value : 0.03838
##
## Sensitivity : 0.6561
## Specificity : 0.6946
## Pos Pred Value : 0.7029
## Neg Pred Value : 0.6472
## Precision : 0.7029
## Recall : 0.6561
## F1 : 0.6787
## Prevalence : 0.5241
## Detection Rate : 0.3439
## Detection Prevalence : 0.4892
## Balanced Accuracy : 0.6753
##
## 'Positive' Class : YES
##
# Stacked-model predictions and confusion matrix on the 2019 validation set
valid.pred <- predict(stack, newdata = valid.cl.data)
caret::confusionMatrix(
  data = valid.pred,
  reference = as.factor(valid.cl.data$STRIKE),
  positive = 'YES',
  mode = 'everything'
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO YES
## NO 87 31
## YES 69 116
##
## Accuracy : 0.67
## 95% CI : (0.6139, 0.7227)
## No Information Rate : 0.5149
## P-Value [Acc > NIR] : 3.264e-08
##
## Kappa : 0.3442
##
## Mcnemar's Test P-Value : 0.0002156
##
## Sensitivity : 0.7891
## Specificity : 0.5577
## Pos Pred Value : 0.6270
## Neg Pred Value : 0.7373
## Precision : 0.6270
## Recall : 0.7891
## F1 : 0.6988
## Prevalence : 0.4851
## Detection Rate : 0.3828
## Detection Prevalence : 0.6106
## Balanced Accuracy : 0.6734
##
## 'Positive' Class : YES
##
# # all others may have just failed and are not listed here
# models.cla <- c("knn", "AdaBoost.M1", "rf", 'glmnet', 'xgboost')
#
# # register parallel front-end
# cl.cla <- makeCluster(detectCores())
# registerDoParallel(cl.cla)
#
# # this setup actually calls the caret::train function, in order to provide
# # minimal error handling this type of construct is needed.
# trainCall <- function(i)
# {
# cat("----------------------------------------------------",
# "\n")
#
# set.seed(123)
# cat(i, " <- loaded\n")
#
# t2 <-
# train(
# train.data[, -11],
# train.data[, c('STRIKE')],
# method = i,
# trControl = trainControl(method = "boot632",
# number = 5)
# )
# }
#
# # use lapply/loop to run everything, required for try/catch error function to work
# t2 <- lapply(models.cla, trainCall)
#
# #remove NULL values, we only allow successful methods; provenance is deleted.
# t2 <- t2[!sapply(t2, is.null)]
#
# # this setup extracts the results with minimal error handling
# # TrainKappa can be sometimes zero, but Accuracy SD can be still available
# printCall <- function(i)
# {
# return(tryCatch({
# cat(sprintf("%-22s", (models.cla[i])))
# cat(round(getTrainPerf(t2[[i]])$TrainAccuracy, 4), "\t")
# cat(round(getTrainPerf(t2[[i]])$TrainKappa, 4), "\t")
# cat(t2[[i]]$times$everything[3], "\n")
# },
# error = function(e)
# NULL))
# }
#
# r2 <- lapply(1:length(t2), printCall)
#
# # stop cluster and register sequential front end
# stopCluster(cl.cla)
# registerDoSEQ()
#
#
# # preallocate data types
# i = 1; MAX = length(t2);
# x1 <- character() # Name
# x2 <- numeric() # R2
# x3 <- numeric() # RMSE
# x4 <- numeric() # time [s]
# x5 <- character() # long model name
#
# # fill data and check indexes and NA with loop/lapply
# for (i in 1:length(t2)) {
# x1[i] <- t2[[i]]$method
# x2[i] <-
# as.numeric(round(getTrainPerf(t2[[i]])$TrainAccuracy, 4))
# x3[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainKappa, 4))
# x4[i] <- as.numeric(t2[[i]]$times$everything[3])
# x5[i] <- t2[[i]]$modelInfo$label
# }
#
# # coerce to data frame
# df1 <- data.frame(x1, x2, x3, x4, x5, stringsAsFactors = FALSE)
#
# # print all results to R-GUI
# df1
#
# # plot models, just as example
# # ggplot(t2[[1]])
# # ggplot(t2[[1]])
#
# # call web output with correct column names
# datatable(
# df1,
# options = list(
# columnDefs = list(list(
# className = 'dt-left', targets = c(0, 1, 2, 3, 4, 5)
# )),
# pageLength = MAX,
# order = list(list(2, 'desc'))
# ),
# colnames = c('Num', 'Name', 'Accuracy', 'Kappa', 'time [s]', 'Model name'),
# caption = paste('Classification results from caret models', Sys.time()),
# class = 'cell-border stripe'
# ) %>%
# formatRound('x2', 3) %>%
# formatRound('x3', 3) %>%
# formatRound('x4', 3) %>%
# formatStyle(
# 2,
# background = styleColorBar(x2, 'steelblue'),
# backgroundSize = '100% 90%',
# backgroundRepeat = 'no-repeat',
# backgroundPosition = 'center'
# )
#
# # print confusion matrix example
# caret::confusionMatrix(t2[[1]])
# # XGBoost ####
#
# modelXGB_sample <- xgboost(
# data = as.matrix(train.data[, -c(11, 9,10)]),
# label = as.matrix(train.data[,11]),
# nrounds = 50,
# # optimal is 97
# max_depth = 50,
# # maximum depth of tree
# eta = 0.3,
# # step size shrinkage, learning rate
# nthread = 4,
# # number of threads to be used. 16 cores available
# "gamma" = 0,
# # minimum loss reduction, controls regularisation
# objective = "binary:logistic",
# min_child_weight = 1,
# # minimum number of instances required in a child node
# subsample = 1,
# # controls number of samples supplied to a tree
# colsample_bytree = 1,
# # controls number of features supplied to a tree
# save_period = NULL
# ) # controls number of features supplied to a tree
#
# test.pred <- predict(modelXGB_sample, newdata = as.matrix(test.data[, -c(11, 9,10)]))
#
# caret::confusionMatrix(
# reference = as.matrix(as.factor(test.data$STRIKE)),
# data = test.pred,
# mode = 'everything',
# positive = 'YES'
# )
#
# valid.pred <- predict(modelXGB_sample, newdata = valid.cl.data)
#
# caret::confusionMatrix(
# reference = as.factor(valid.cl.data$STRIKE),
# data = valid.pred,
# mode = 'everything',
# positive = 'YES'
# )
# prob_predXGB_sample <- predict(modelXGB_sample, newdata = as.matrix(test.data[,-c(11,9,10)])) # Predict the Test set results (probabilities)
# predictXGB_sample = ifelse(prob_predXGB_sample > 0.5, 1, 0) # convert probabilities to binary
#
# cmXGB_sample <- table(predictXGB_sample>0.7, test.data$STRIKE)
# cmXGB_sample # Confusion matrix
# errorXGB_sample <- 100*(1-sum(diag(cmXGB_sample))/sum(cmXGB_sample))
# errorXGB_sample # error rate
# accuracyXGB_sample <- 100 - errorXGB_sample
# accuracyXGB_sample # accuracy rate
# precisionXGB_sample <- 100*cmXGB_sample[2,2]/sum(cmXGB_sample[2,1],cmXGB_sample[2,2])
# precisionXGB_sample # precision
# recallXGB_sample <- 100*cmXGB_sample[2,2]/sum(cmXGB_sample[1,2],cmXGB_sample[2,2])
# recallXGB_sample # recall
# FscoreXGB_sample <- 2*precisionXGB_sample*recallXGB_sample/(precisionXGB_sample+recallXGB_sample)
# FscoreXGB_sample # F-score
# xgb.pred <-
# prediction(prob_predXGB_sample, test.data)
# xgb.perf <-
# performance(xgb.pred, "tpr", "fpr")
# plot(
# xgb.perf,
# avg = "threshold",
# colorize = TRUE,
# lwd = 1,
# main = "ROC Curve w/ Thresholds",
# print.cutoffs.at = seq(0, 1, by = 0.05),
# text.adj = c(-0.5, 0.5),
# text.cex = 0.1
# )
# grid(col = "lightgray")
# axis(1, at = seq(0, 1, by = 0.1))
# axis(2, at = seq(0, 1, by = 0.1))
# abline(v = c(0.1, 0.3, 0.5, 0.7, 0.9),
# col = "lightgray",
# lty = "dotted") abline(h = c(0.1, 0.3, 0.5, 0.7, 0.9),
# col = "lightgray",
# lty = "dotted") lines(
# x = c(0, 1),
# y = c(0, 1),
# col = "black",
# lty = "dotted"
# )
# Work on a copy of the cleaned classification data for the H2O models
h2o.data <- class.data
# # one-hot-encoding categorical features
# ohe_feats = c('MONTH', 'SEASON')
#
# # Create dummies
# dummies <- dummyVars(~ MONTH + SEASON, data = h2o.data)
#
# df.dummies <- as.data.frame(predict(dummies, newdata = h2o.data))
# # Merge Dummies to data frame
# h2o.data <-
# cbind(h2o.data[, -c(which(colnames(h2o.data) %in% ohe_feats))], df.dummies)
# h2o.data <-
# subset(h2o.data, select = -c(YEAR.2013, YEAR.2019))
# Create the training and test datasets
set.seed(100)
# Ensure the response is a factor before the stratified split
h2o.data$STRIKE <- as.factor(h2o.data$STRIKE)
# Step 1: Get row numbers for the training data
# (createDataPartition stratifies the 75/25 split on STRIKE)
trainRowNumbers.cl <-
createDataPartition(h2o.data$STRIKE, p = 0.75, list = FALSE)
# Step 2: Create the training dataset
train.data <- h2o.data[trainRowNumbers.cl, ]
# Step 3: Create the test dataset (the complement of the training rows)
test.data <- h2o.data[-trainRowNumbers.cl, ]
# Upload the training set to the H2O cluster as an H2OFrame
train.data <- as.h2o(train.data)
##
|
| | 0%
|
|======================================================================| 100%
test.data <- as.h2o(test.data)
##
|
| | 0%
|
|======================================================================| 100%
# Identify predictors and response
y <- "STRIKE"
x <- setdiff(names(h2o.data), c("STRIKE"))

# For binary classification the response must be a factor (enum) column,
# otherwise H2O would fit a regression model.
train.data[, y] <- as.factor(train.data[, y])
test.data[, y] <- as.factor(test.data[, y])

# Number of CV folds (to generate level-one data for stacking)
nfolds <- 5

# 2. Generate a random grid of models and stack them together
# Shared XGBoost/GBM hyperparameter search space; RandomDiscrete sampling
# draws at most `max_models` combinations from these sequences.
hyper_params <- list(ntrees = seq(10, 1000, 1),
                     learn_rate = seq(0.0001, 0.2, 0.0001),
                     max_depth = seq(1, 20, 1),
                     sample_rate = seq(0.5, 1.0, 0.0001),
                     col_sample_rate = seq(0.2, 1.0, 0.0001))
search_criteria <- list(strategy = "RandomDiscrete",
                        max_models = 10)

# Unique suffix for grid ids. The previous version used only the seconds
# field ("%S"), which collides whenever two runs start on the same second
# of any minute; a full timestamp makes the id effectively unique per run.
# (format() already returns character, so no as.character() wrapper needed.)
grid.id <- format(Sys.time(), "%Y%m%d%H%M%S")
# Train & Cross-validate a RF
# NOTE(review): no hyper_params / search_criteria are supplied here, so this
# "grid" holds a single DRF model (ntrees = 2500) — confirm that is intended.
# Modulo fold assignment + keep_cross_validation_predictions = TRUE make the
# CV holdout predictions reusable as level-one data for stacking.
rf_grid <- h2o.grid(algorithm = "drf",
grid_id = paste0("grid_binomial_rf_",grid.id),
x = x,
y = y,
training_frame = train.data,
seed = 100,
nfolds = nfolds,
ntrees = 2500,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 6%
|
|====== | 8%
|
|======= | 10%
|
|========= | 12%
|
|========== | 14%
|
|=========== | 16%
|
|============= | 18%
|
|============== | 20%
|
|=============== | 22%
|
|================= | 24%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 29%
|
|====================== | 31%
|
|======================= | 33%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|============================ | 40%
|
|============================= | 41%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 46%
|
|================================== | 48%
|
|=================================== | 50%
|
|==================================== | 52%
|
|====================================== | 54%
|
|======================================= | 56%
|
|========================================= | 58%
|
|========================================== | 60%
|
|============================================ | 62%
|
|============================================= | 64%
|
|============================================== | 66%
|
|=============================================== | 68%
|
|================================================= | 70%
|
|================================================== | 71%
|
|=================================================== | 73%
|
|==================================================== | 75%
|
|====================================================== | 77%
|
|======================================================= | 79%
|
|========================================================= | 81%
|
|========================================================== | 83%
|
|========================================================== | 84%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================= | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|======================================================================| 99%
|
|======================================================================| 100%
# Train & cross-validate a random grid of GBMs over the shared search space
# (hyper_params / search_criteria defined above; at most 10 sampled models).
# Modulo fold assignment + keep_cross_validation_predictions = TRUE make the
# CV holdout predictions reusable as level-one data for stacking.
gbm_grid <- h2o.grid(algorithm = "gbm",
grid_id = paste0("grid_binomial_gbm_",grid.id),
x = x,
y = y,
training_frame = train.data,
# ntrees = seq(10, 1000, 1),
seed = 100,
nfolds = nfolds,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
hyper_params = hyper_params,
search_criteria = search_criteria)
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============== | 20%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 32%
|
|======================== | 34%
|
|========================= | 36%
|
|=========================== | 38%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|==================================== | 51%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|===================================================== | 75%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================================| 100%
# Train & cross-validate a random grid of XGBoost models over the same
# shared search space, with identical fold assignment so its CV predictions
# can be stacked together with the GBM grid's.
xgb_grid <- h2o.grid(algorithm = "xgboost",
grid_id = paste0("grid_binomial_xgb_",grid.id),
x = x,
y = y,
training_frame = train.data,
nfolds = nfolds,
seed = 100,
fold_assignment = "Modulo",
keep_cross_validation_predictions = TRUE,
hyper_params = hyper_params,
search_criteria = search_criteria)
##
|
| | 0%
|
| | 1%
|
|= | 2%
|
|== | 3%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|===== | 8%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|============ | 17%
|
|================ | 23%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|======================= | 33%
|
|========================= | 36%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================= | 47%
|
|=================================== | 50%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|===================================== | 54%
|
|====================================== | 54%
|
|======================================= | 55%
|
|========================================= | 59%
|
|============================================ | 63%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|======================================================= | 79%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|=============================================================== | 91%
|
|=================================================================== | 95%
|
|======================================================================| 100%
# Collect the base learners for stacking: every model from the GBM and
# XGBoost grids.
# NOTE(review): rf_grid also kept its CV predictions (same folds) but is not
# appended here — confirm excluding the DRF model from the ensemble is
# intentional.
base.models <- append(gbm_grid@model_ids,
xgb_grid@model_ids)
# Train a stacked ensemble on the base models' cross-validated predictions
ensemble <- h2o.stackedEnsemble(x = x,
y = y,
model_id = paste0("ensemble_gbm_grid_", grid.id, "_1"),
training_frame = train.data,
base_models = base.models)
##
|
| | 0%
|
|======================================================================| 100%
# Eval ensemble performance on the held-out test set
perf <- h2o.performance(ensemble, newdata = test.data)
# Compare the ensemble against each base learner on the test set.
# Helper: look a model up by id and score its AUC on test.data.
.getauc <- function(mm) {
  mdl <- h2o.getModel(mm)
  mdl.perf <- h2o.performance(mdl, newdata = test.data)
  h2o.auc(mdl.perf)
}
# Test AUC of every base learner, the best single-model score,
# and the stacked ensemble's test AUC for comparison.
baselearner_aucs <- sapply(base.models, .getauc)
baselearner_best_auc_test <- max(baselearner_aucs)
ensemble_auc_test <- h2o.auc(perf)
# Report best single-model AUC vs the stacked ensemble's AUC
print(sprintf("Best Base-learner Test AUC: %s", baselearner_best_auc_test))
## [1] "Best Base-learner Test AUC: 0.736546844424646"
print(sprintf("Ensemble Test AUC: %s", ensemble_auc_test))
## [1] "Ensemble Test AUC: 0.738965031738709"
# Generate predictions on a test set (if necessary)
pred <- h2o.predict(ensemble, newdata = test.data)
##
|
| | 0%
|
|======================================================================| 100%
# Sort the GBM grid's models by cross-validated AUC, best first, and print
get_gbm_grid <- h2o.getGrid(grid_id = gbm_grid@grid_id, sort_by = "AUC", decreasing = TRUE)
get_gbm_grid
## H2O Grid Details
## ================
##
## Grid ID: grid_binomial_gbm_14
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - ntrees
## - sample_rate
## Number of models: 10
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing AUC
## col_sample_rate learn_rate max_depth ntrees sample_rate
## 1 0.9046 0.0228 10 28 0.6884
## 2 0.9203 0.0021 12 76 0.6037
## 3 0.9257 0.007 15 252 0.6716
## 4 0.4466 0.0379 4 632 0.7663
## 5 0.3739 0.0261 10 427 0.6678
## 6 0.3326 0.0291 19 277 0.8742
## 7 0.4807 0.0334 7 866 0.9561
## 8 0.3513 0.0821 20 486 0.9026
## 9 0.6821 0.0334 14 728 0.6761
## 10 0.8212 0.1342 8 441 0.9764
## model_ids auc
## 1 grid_binomial_gbm_14_model_5 0.7181182460928357
## 2 grid_binomial_gbm_14_model_2 0.715857633046727
## 3 grid_binomial_gbm_14_model_4 0.7130017267038504
## 4 grid_binomial_gbm_14_model_7 0.7113849843006849
## 5 grid_binomial_gbm_14_model_3 0.7042723775961279
## 6 grid_binomial_gbm_14_model_9 0.7028150572108655
## 7 grid_binomial_gbm_14_model_6 0.6933411498699452
## 8 grid_binomial_gbm_14_model_8 0.6854323163003493
## 9 grid_binomial_gbm_14_model_1 0.6843664851640369
## 10 grid_binomial_gbm_14_model_10 0.6753911580397716
# Model id of the best GBM (first row of the AUC-sorted summary table)
gbm_grid_top_model <- get_gbm_grid@summary_table[1, "model_ids"]
gbm_grid_top_model
## [1] "grid_binomial_gbm_14_model_5"
# Sort the XGBoost grid's models by cross-validated AUC, best first
get_xgb_grid <- h2o.getGrid(grid_id = xgb_grid@grid_id, sort_by = "AUC", decreasing = TRUE)
get_xgb_grid
## H2O Grid Details
## ================
##
## Grid ID: grid_binomial_xgb_14
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - ntrees
## - sample_rate
## Number of models: 10
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing AUC
## col_sample_rate learn_rate max_depth ntrees sample_rate
## 1 0.6801 0.0616 6 12 0.5436
## 2 0.7238 0.01 9 227 0.6867
## 3 0.9543 0.0446 16 330 0.6907
## 4 0.5262 0.0409 11 793 0.7308
## 5 0.7688 0.0963 15 249 0.6112
## 6 0.5476 0.0998 13 430 0.9742
## 7 0.3023 0.1302 12 723 0.5747
## 8 0.4026 0.0823 13 939 0.9523
## 9 0.6441 0.1299 3 856 0.768
## 10 0.3266 0.1656 13 922 0.6382
## model_ids auc
## 1 grid_binomial_xgb_14_model_4 0.7269703634027106
## 2 grid_binomial_xgb_14_model_10 0.7204998608921451
## 3 grid_binomial_xgb_14_model_1 0.6984243275349646
## 4 grid_binomial_xgb_14_model_5 0.6953767614808142
## 5 grid_binomial_xgb_14_model_2 0.6926791731252456
## 6 grid_binomial_xgb_14_model_6 0.6918023520267793
## 7 grid_binomial_xgb_14_model_8 0.6901940002561351
## 8 grid_binomial_xgb_14_model_7 0.6883440865913276
## 9 grid_binomial_xgb_14_model_3 0.686505213232469
## 10 grid_binomial_xgb_14_model_9 0.6820195369254073
# Model id of the best XGBoost model (first row of the AUC-sorted table)
xgb_grid_top_model <- get_xgb_grid@summary_table[1, "model_ids"]
xgb_grid_top_model
## [1] "grid_binomial_xgb_14_model_4"
# Sort the RF grid by CV AUC (comment previously said XGBOOST — copy-paste)
get_rf_grid <- h2o.getGrid(grid_id = rf_grid@grid_id, sort_by = "AUC", decreasing = TRUE)
get_rf_grid
## H2O Grid Details
## ================
##
## Grid ID: grid_binomial_rf_14
## Used hyper parameters:
## Number of models: 1
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing AUC
## model_ids auc
## 1 grid_binomial_rf_14_model_1 0.7211307039740685
# Model id of the best (and only) RF model in the grid
rf_grid_top_model <- get_rf_grid@summary_table[1, "model_ids"]
rf_grid_top_model
## [1] "grid_binomial_rf_14_model_1"
# Use AutoML to find a list of candidate models (i.e., leaderboard)
# NOTE(review): keep_cross_validation_predictions = FALSE means these AutoML
# models cannot later be stacked manually — confirm that is acceptable.
# NOTE(review): stopping_tolerance = 0 triggered H2O's warning (see run log)
# that models may be slow to converge or never converge — consider leaving
# it at the data-size-based default.
auto_ml <- h2o.automl(
x = x,
y = y,
training_frame = train.data,
nfolds = 5,
# Hard budget: stop after 2 hours or 10 models, whichever comes first
max_runtime_secs = 60 * 120,
max_models = 10,
keep_cross_validation_predictions = FALSE,
sort_metric = "auc",
seed = 123,
stopping_rounds = 50,
stopping_metric = "auc",
stopping_tolerance = 0
)
##
|
| | 0%
## 23:17:57.268: Stopping tolerance set by the user is < 70% of the recommended default of 0.018217988943396556, so models may take a long time to converge or may not converge at all.
|
|== | 2%
|
|== | 3%
|
|==== | 5%
|
|==== | 6%
|
|===== | 8%
|
|======= | 10%
|
|======== | 11%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|===================== | 31%
|
|======================= | 33%
|
|======================================================================| 100%
# Assess the leader board; the following keeps only model_id and auc and
# shows up to the top 25 models (comment previously said "top and bottom
# 15"). You can get the top model with auto_ml@leader
auto_ml@leaderboard %>%
as.data.frame() %>%
dplyr::select(model_id, auc) %>%
dplyr::slice(1:25)
## model_id auc
## 1 StackedEnsemble_BestOfFamily_AutoML_20200226_231757 0.7370301
## 2 StackedEnsemble_AllModels_AutoML_20200226_231757 0.7367382
## 3 XGBoost_3_AutoML_20200226_231757 0.7325164
## 4 GLM_1_AutoML_20200226_231757 0.7324252
## 5 XGBoost_2_AutoML_20200226_231757 0.7291813
## 6 XGBoost_1_AutoML_20200226_231757 0.7254258
## 7 DRF_1_AutoML_20200226_231757 0.7101999
## 8 GBM_5_AutoML_20200226_231757 0.6974415
## 9 GBM_1_AutoML_20200226_231757 0.6872763
## 10 GBM_2_AutoML_20200226_231757 0.6792325
## 11 GBM_4_AutoML_20200226_231757 0.6751355
## 12 GBM_3_AutoML_20200226_231757 0.6739104